#pragma once
#include <cstddef>
/**
 * Prototype for gmem_bw_ori. Only the declaration appears in this header;
 * the body is defined elsewhere.
 *
 * NOTE(review): the name reads like "global-memory bandwidth, original
 * version" -- presumably the baseline variant; confirm against the
 * definition.
 *
 * @param out    Non-const float pointer (presumably the destination buffer;
 *               TODO confirm which memory space it must point to).
 * @param x      Non-const float pointer (presumably the source buffer; the
 *               signature does not promise that it is left unmodified).
 * @param y      Float scalar passed by value; its role is not visible from
 *               this declaration.
 * @param stream CUDA stream handle (presumably the stream the work is
 *               issued on -- verify against the definition).
 * @param N      Unsigned size value (presumably the element count of the
 *               buffers -- verify against the definition).
 *
 * NOTE(review): the only include visible in this header is <cstddef>, so
 * cudaStream_t has to be declared by the including file before this point
 * (for example through <cuda_runtime.h>).
 */
void gmem_bw_ori(float *out,
                 float *x,
                 float y,
                 cudaStream_t stream,
                 std::size_t N);
/**
 * Prototype for gmem_bw_opt1. Only the declaration appears in this header;
 * the body is defined elsewhere. Its parameter list is identical to that of
 * gmem_bw_ori.
 *
 * NOTE(review): the name reads like "global-memory bandwidth, optimisation
 * #1" -- presumably a tuned variant of gmem_bw_ori; confirm against the
 * definition.
 *
 * @param out    Non-const float pointer (presumably the destination buffer;
 *               TODO confirm which memory space it must point to).
 * @param x      Non-const float pointer (presumably the source buffer; the
 *               signature does not promise that it is left unmodified).
 * @param y      Float scalar passed by value; its role is not visible from
 *               this declaration.
 * @param stream CUDA stream handle (presumably the stream the work is
 *               issued on -- verify against the definition).
 * @param N      Unsigned size value (presumably the element count of the
 *               buffers -- verify against the definition).
 *
 * NOTE(review): the only include visible in this header is <cstddef>, so
 * cudaStream_t has to be declared by the including file before this point
 * (for example through <cuda_runtime.h>).
 */
void gmem_bw_opt1(float *out,
                  float *x,
                  float y,
                  cudaStream_t stream,
                  std::size_t N);